import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as plt1
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.subplots import make_subplots
from sklearn.linear_model import LogisticRegression
# Load the heart-failure clinical records dataset (299 patients, 13 columns,
# binary target DEATH_EVENT) from the working directory.
dosbol=pd.read_csv("heart_failure_clinical_records_dataset.csv")
# Notebook-style expression: displays the full dataframe.
dosbol
| age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 75.0 | 0 | 582 | 0 | 20 | 1 | 265000.00 | 1.9 | 130 | 1 | 0 | 4 | 1 |
| 1 | 55.0 | 0 | 7861 | 0 | 38 | 0 | 263358.03 | 1.1 | 136 | 1 | 0 | 6 | 1 |
| 2 | 65.0 | 0 | 146 | 0 | 20 | 0 | 162000.00 | 1.3 | 129 | 1 | 1 | 7 | 1 |
| 3 | 50.0 | 1 | 111 | 0 | 20 | 0 | 210000.00 | 1.9 | 137 | 1 | 0 | 7 | 1 |
| 4 | 65.0 | 1 | 160 | 1 | 20 | 0 | 327000.00 | 2.7 | 116 | 0 | 0 | 8 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 294 | 62.0 | 0 | 61 | 1 | 38 | 1 | 155000.00 | 1.1 | 143 | 1 | 1 | 270 | 0 |
| 295 | 55.0 | 0 | 1820 | 0 | 38 | 0 | 270000.00 | 1.2 | 139 | 0 | 0 | 271 | 0 |
| 296 | 45.0 | 0 | 2060 | 1 | 60 | 0 | 742000.00 | 0.8 | 138 | 0 | 0 | 278 | 0 |
| 297 | 45.0 | 0 | 2413 | 0 | 38 | 0 | 140000.00 | 1.4 | 140 | 1 | 1 | 280 | 0 |
| 298 | 50.0 | 0 | 196 | 0 | 45 | 0 | 395000.00 | 1.6 | 136 | 1 | 1 | 285 | 0 |
299 rows × 13 columns
1. Linear regression is a method for predicting outcomes: identifying the values of its coefficients makes it possible to produce accurate forecasts, and it is simple to use while providing reliable predictions. Logistic regression is similar to linear regression in that it also determines coefficient values that weight each input variable. The key difference is that logistic regression passes the weighted sum through a logistic (non-linear) function, which makes it suitable for binary classification problems; its outputs can also be read as probabilities, which helps explain its predictions. Regression problems are solved using linear regression, while logistic regression — although it is related to regression — is most commonly used to solve classification problems.
2.The response variable (y) is a random variable while the predictor variable (x) is assumed non-random or fixed and measured without error
3.We make data simpler to read and utilize by preparing it. This procedure removes data discrepancies or duplicates that may otherwise degrade the accuracy of a model. Data preparation also guarantees that no inaccurate or missing values are included due to human mistakes or defects.
4.When your model memorizes the data without truly understanding/interpolating a generic function that would apply to any external data, this is known as overfitting. In other words, it develops a function that is tuned primarily to the training dataset. Hence, it performs near-perfectly with the training data but fails miserably with data it hasn't seen before. When there isn't enough data for the model to create/interpolate a generic function that defines the process, this is known as underfitting. As a result, the model performs poorly for most input data.
a. Features:
b. Categoricals:
Because any binary variable is categorical; a categorical variable takes only a small number of distinct values.
c. Continuous:
Because a variable that can take a very large (effectively unbounded) number of distinct values is considered continuous.
In this Project, we have to predict when people will have mortality caused by heart failure. We will use 5 different plots to visualize and see what the data tells us. So, If we use only one model, we can get not high accuracy therefore we need to use 2 different models to compare each other to get a more accurate prediction about mortality caused by heart failure
dosbol.head()
| age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 75.0 | 0 | 582 | 0 | 20 | 1 | 265000.00 | 1.9 | 130 | 1 | 0 | 4 | 1 |
| 1 | 55.0 | 0 | 7861 | 0 | 38 | 0 | 263358.03 | 1.1 | 136 | 1 | 0 | 6 | 1 |
| 2 | 65.0 | 0 | 146 | 0 | 20 | 0 | 162000.00 | 1.3 | 129 | 1 | 1 | 7 | 1 |
| 3 | 50.0 | 1 | 111 | 0 | 20 | 0 | 210000.00 | 1.9 | 137 | 1 | 0 | 7 | 1 |
| 4 | 65.0 | 1 | 160 | 1 | 20 | 0 | 327000.00 | 2.7 | 116 | 0 | 0 | 8 | 1 |
dosbol.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 299 entries, 0 to 298 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 299 non-null float64 1 anaemia 299 non-null int64 2 creatinine_phosphokinase 299 non-null int64 3 diabetes 299 non-null int64 4 ejection_fraction 299 non-null int64 5 high_blood_pressure 299 non-null int64 6 platelets 299 non-null float64 7 serum_creatinine 299 non-null float64 8 serum_sodium 299 non-null int64 9 sex 299 non-null int64 10 smoking 299 non-null int64 11 time 299 non-null int64 12 DEATH_EVENT 299 non-null int64 dtypes: float64(3), int64(10) memory usage: 30.5 KB
dosbol.nunique()
age 47 anaemia 2 creatinine_phosphokinase 208 diabetes 2 ejection_fraction 17 high_blood_pressure 2 platelets 176 serum_creatinine 40 serum_sodium 27 sex 2 smoking 2 time 148 DEATH_EVENT 2 dtype: int64
# Checking for null values per column.
# isna() is the modern alias of isnull(); the dataset has no missing entries.
dosbol.isna().sum()
age 0 anaemia 0 creatinine_phosphokinase 0 diabetes 0 ejection_fraction 0 high_blood_pressure 0 platelets 0 serum_creatinine 0 serum_sodium 0 sex 0 smoking 0 time 0 DEATH_EVENT 0 dtype: int64
# Boxplots of the three continuous features most likely to contain outliers,
# drawn one figure per feature so each keeps its own scale.
# NOTE: the original passed the Series positionally (sns.boxplot(series)),
# which is deprecated in recent seaborn releases; the keyword form below is
# the supported equivalent.
for column in ["ejection_fraction", "serum_creatinine", "time"]:
    plt.figure(figsize=(20, 10))
    sns.boxplot(x=dosbol[column])
    plt.show()
# MY PIE CHART
# Bug fix: the original made a deep copy of the dataframe and then overwrote
# it with a replace() applied to the ORIGINAL frame, so the copy was never
# used.  The replacement is now applied to the copy itself (same result,
# but the copy actually serves its purpose).
New_sex = dosbol.copy(deep=True)
New_sex = New_sex.replace({'sex': {0: "Female", 1: "Male"}})

def label_function(val):
    """Format a pie wedge as 'count<newline>percent%' for this dataset's size."""
    return f'{val / 100 * len(New_sex):.0f}\n{val:.0f}%'

fig, ax1 = plt.subplots(ncols=1, figsize=(10, 5))
New_sex.groupby('sex').size().plot(kind='pie', autopct=label_function,
                                   textprops={'fontsize': 20},
                                   colors=['red', 'blue'], ax=ax1)
plt.title("GENDER DISTRIBUTION in Dataset")  # typo "DISCTIBUTION" fixed
plt.ylabel("PIE CHART")
plt.show()
# I GOT THIS CODE FROM INTERNET. I INSTALLED PLONTY LIBRARY
# Left donut: gender split of the whole dataset.
# Right donut: gender broken down by survival (DEATH_EVENT).
Male_Survived = dosbol.query("DEATH_EVENT == 0 and sex == 1")
Male_Died = dosbol.query("DEATH_EVENT == 1 and sex == 1")
Female_Survived = dosbol.query("DEATH_EVENT == 0 and sex == 0")
Female_Died = dosbol.query("DEATH_EVENT == 1 and sex == 0")
label1 = ["Male", "Female"]
label2 = ['Male - Survived', 'Male - Died', "Female - Survived", "Female - Died"]
values1 = [len(Male_Survived) + len(Male_Died),
           len(Female_Survived) + len(Female_Died)]
values2 = [len(group) for group in
           (Male_Survived, Male_Died, Female_Survived, Female_Died)]
fig = make_subplots(rows=1, cols=2,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="GENDER"), 1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="GENDER VS DEATH EVENT"), 1, 2)
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(title_text="GENDER DISTRIBUTION IN THE DATASET \ GENDER VS DEATH EVENT",
                  # Labels placed at the centre of each donut hole.
                  annotations=[dict(text='GENDER', x=0.19, y=0.5, font_size=10,
                                    showarrow=False, font_color="white"),
                               dict(text='GENDER VS DEATH EVENT', x=0.84, y=0.5,
                                    font_size=9, showarrow=False, font_color="white")],
                  autosize=False, width=1200, height=500,
                  paper_bgcolor="black", font_color="white")
fig.show()
As we can see in the first pie chart, our data has more Males than Females. Every 2/3 person in your data is identified as a Male, and in the second pie chart, we can see that Male Survive and Die more than females.
# I GOT THIS CODE FROM INTERNET. I INSTALLED PLONTY LIBRARY
# Left donut: gender split; right donut: the four gender x anaemia subgroups.
# Bug fix: the original reused the Male_Survived/Male_Died names from the
# previous cell AND paired the 'Anaemia' labels with the anaemia == 0
# (non-anaemic) counts.  Labels and counts are now aligned.
Male_NonAnaemic = dosbol[(dosbol["anaemia"]==0) & (dosbol["sex"]==1)]
Male_Anaemic = dosbol[(dosbol["anaemia"]==1) & (dosbol["sex"]==1)]
Female_NonAnaemic = dosbol[(dosbol["anaemia"]==0) & (dosbol["sex"]==0)]
Female_Anaemic = dosbol[(dosbol["anaemia"]==1) & (dosbol["sex"]==0)]
label1 = ["Male","Female"]
label2 = ['Male - Anaemia','Male - Non-Anaemia', "Female - Anaemia", "Female - Non-Anaemia"]
values1 = [(len(Male_NonAnaemic)+len(Male_Anaemic)), (len(Female_NonAnaemic)+len(Female_Anaemic))]
values2 = [len(Male_Anaemic),len(Male_NonAnaemic),len(Female_Anaemic),len(Female_NonAnaemic)]
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="ANAEMIA"),1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="GENDER VS ANAEMIA"),1, 2)
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(title_text="ANAEMIA DISTRIBUTION IN THE DATASET \ GENDER VS ANAEMIA",
                  # Annotations in the centre of each donut.
                  annotations=[dict(text='GENDER', x=0.19, y=0.5, font_size=10, showarrow=False ,font_color="white"),
                               dict(text='GENDER VS ANAEMIA', x=0.84, y=0.5, font_size=9, showarrow=False,font_color="white")],
                  autosize=False,width=1200, height=500, paper_bgcolor="black",font_color="white")
fig.show()
As we can see in the first pie chart, our data has more Males than Females — roughly 2 out of every 3 people in the data are identified as Male. In the second pie chart, we can see that there are more anaemic and more non-anaemic males than females.
# I GOT THIS CODE FROM INTERNET. I INSTALLED PLONTY LIBRARY
# Left donut: gender split; right donut: the four gender x diabetes subgroups.
# Bug fix: the original paired 'Yes, Diabetes' labels with the diabetes == 0
# counts (and reused survival variable names).  Labels and counts now match.
Male_NoDiabetes = dosbol[(dosbol["diabetes"]==0) & (dosbol["sex"]==1)]
Male_Diabetes = dosbol[(dosbol["diabetes"]==1) & (dosbol["sex"]==1)]
Female_NoDiabetes = dosbol[(dosbol["diabetes"]==0) & (dosbol["sex"]==0)]
Female_Diabetes = dosbol[(dosbol["diabetes"]==1) & (dosbol["sex"]==0)]
label1 = ["Male","Female"]
label2 = ['Male - Yes, Diabetes','Male - No, Diabetes', "Female - Yes, Diabetes", "Female -No, Diabetes"]
values1 = [(len(Male_NoDiabetes)+len(Male_Diabetes)), (len(Female_NoDiabetes)+len(Female_Diabetes))]
values2 = [len(Male_Diabetes),len(Male_NoDiabetes),len(Female_Diabetes),len(Female_NoDiabetes)]
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="DIABETES"),1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="DIABETES VS SEX"),1, 2)
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(title_text="DIABETES DISTRIBUTION IN THE DATASET \ DIABETES VS SEX",
                  # Annotations in the centre of each donut.
                  annotations=[dict(text='DIABETES', x=0.19, y=0.5, font_size=10, showarrow=False ,font_color="white"),
                               dict(text='DIABETES VS SEX', x=0.84, y=0.5, font_size=9, showarrow=False,font_color="white")],
                  autosize=False,width=1200, height=500, paper_bgcolor="black",font_color="white")
fig.show()
As we can see in the first pie chart, our data has more Males than Females. Every 2/3 person in your data is identified as a Male, and in the second pie chart, we can see that Males have more Diabetes than females.
# I GOT THIS CODE FROM INTERNET. I INSTALLED PLONTY LIBRARY
# Left donut: gender split; right donut: gender x high-blood-pressure groups.
# Bug fix: the original paired the 'Yes' labels with the
# high_blood_pressure == 0 counts.  Labels and counts are now aligned.
Male_NoHBP = dosbol[(dosbol["high_blood_pressure"]==0) & (dosbol["sex"]==1)]
Male_HBP = dosbol[(dosbol["high_blood_pressure"]==1) & (dosbol["sex"]==1)]
Female_NoHBP = dosbol[(dosbol["high_blood_pressure"]==0) & (dosbol["sex"]==0)]
Female_HBP = dosbol[(dosbol["high_blood_pressure"]==1) & (dosbol["sex"]==0)]
label1 = ["Male","Female"]
label2 = ['Male - Yes','Male - No', "Female - Yes", "Female -No"]
values1 = [(len(Male_NoHBP)+len(Male_HBP)), (len(Female_NoHBP)+len(Female_HBP))]
values2 = [len(Male_HBP),len(Male_NoHBP),len(Female_HBP),len(Female_NoHBP)]
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="HIGH BLOOD PRESSURE "),1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="HIGH BLOOD PRESSURE VS SEX"),1, 2)
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(title_text="HIGH BLOOD PRESSURE DISTRIBUTION IN THE DATASET \ HIGH BLOOD PRESSURE VS SEX",
                  # Annotations in the centre of each donut.
                  annotations=[dict(text='HIGH BLOOD PRESSURE ', x=0.19, y=0.5, font_size=10, showarrow=False ,font_color="white"),
                               dict(text='HIGH BLOOD PRESSURE VS SEX', x=0.84, y=0.5, font_size=9, showarrow=False,font_color="white")],
                  autosize=False,width=1200, height=500, paper_bgcolor="black",font_color="white")
fig.show()
As we can see in the first pie chart, our data has more Males than Females. Every 2/3 person in your data is identified as a Male, and in the second pie chart, we can see that Males have more high blood Pressure than females.
# I GOT THIS CODE FROM INTERNET. I INSTALLED PLONTY LIBRARY
# Left donut: gender split; right donut: gender x smoking groups.
# Bug fixes: (1) the original paired the 'Yes' labels with the smoking == 0
# counts; (2) the figure title said "SMOKING VS DEATH EVENT" although the
# chart shows smoking vs sex.
Male_NonSmoker = dosbol[(dosbol["smoking"]==0) & (dosbol["sex"]==1)]
Male_Smoker = dosbol[(dosbol["smoking"]==1) & (dosbol["sex"]==1)]
Female_NonSmoker = dosbol[(dosbol["smoking"]==0) & (dosbol["sex"]==0)]
Female_Smoker = dosbol[(dosbol["smoking"]==1) & (dosbol["sex"]==0)]
label1 = ["Male","Female"]
label2 = ['Male - Yes','Male - No', "Female - Yes", "Female -No"]
values1 = [(len(Male_NonSmoker)+len(Male_Smoker)), (len(Female_NonSmoker)+len(Female_Smoker))]
values2 = [len(Male_Smoker),len(Male_NonSmoker),len(Female_Smoker),len(Female_NonSmoker)]
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="SMOKING"),1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="SMOKING VS SEX"),1, 2)
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(title_text="SMOKING DISTRIBUTION IN THE DATASET \ SMOKING VS SEX",
                  # Annotations in the centre of each donut.
                  annotations=[dict(text='SMOKING', x=0.19, y=0.5, font_size=10, showarrow=False ,font_color="white"),
                               dict(text='SMOKING VS SEX', x=0.84, y=0.5, font_size=9, showarrow=False,font_color="white")],
                  autosize=False,width=1200, height=500, paper_bgcolor="black",font_color="white")
fig.show()
As we can see in the first pie chart, our data has more Males than Females. Every 2/3 person in your data is identified as a Male and in the second pie chart, we can see that Males and Females smoke almost equally.
# Density plots for the first nine columns, in a 3x3 grid.
# NOTE: sns.distplot() was deprecated and later removed from seaborn;
# histplot(..., kde=True, stat="density") is the supported equivalent
# (histogram on a density scale with a KDE overlay).
first_nine = [
    ("age", "Distribution of Age "),
    ("anaemia", "Distribution Anaemia "),
    ("creatinine_phosphokinase", "Distribution of Creatinine Phosphokinase "),
    ("diabetes", "Distribution of Diabetes "),
    ("ejection_fraction", "Distribution of Ejection Fraction "),
    ("high_blood_pressure", "Distribution of High Blood Pressure "),
    ("platelets", "Distribution of Platelets "),
    ("serum_creatinine", "Distribution of Serum Creatinine "),
    ("serum_sodium", "Distribution of Serum Sodium "),
]
plt.figure(figsize=(20, 15))
for position, (column, title) in enumerate(first_nine, start=1):
    plt.subplot(3, 3, position)
    plt.title(title)
    sns.histplot(dosbol[column], kde=True, stat="density")
plt.show()
# Density plots for the remaining four columns, same 3x3 grid layout.
# NOTE: sns.distplot() was deprecated and later removed from seaborn;
# histplot(..., kde=True, stat="density") is the supported equivalent.
remaining = [
    ("sex", "Distribution of Sex "),
    ("smoking", "Distribution Smoking "),
    ("time", "Distribution of Time "),
    ("DEATH_EVENT", "Distribution of DEATH EVENT "),
]
plt.figure(figsize=(20, 15))
for position, (column, title) in enumerate(remaining, start=1):
    plt.subplot(3, 3, position)
    plt.title(title)
    sns.histplot(dosbol[column], kde=True, stat="density")
plt.show()
So we have created distribution plot to all our numerical variables to see their Densities
# Pairwise Pearson correlations over all 13 columns, with the coefficient
# printed inside each cell.
correlation_matrix = dosbol.corr()
plt.figure(figsize=(20, 30))
sns.heatmap(correlation_matrix, annot=True, cmap="Blues")
plt.show()
This heatmap suggests that there is no strong multicollinearity, because none of the pairwise correlation coefficients between features is higher than 0.5.
# Scatter of age against follow-up time, coloured by survival outcome.
plt.figure(figsize=(10, 6))
X2 = dosbol["age"]
X1 = dosbol["time"]
sns.scatterplot(data=dosbol, x=X2, y=X1, hue="DEATH_EVENT")
plt.xlabel('AGE')
plt.ylabel('TIME')
plt.title('AGE vs TIME')
plt.grid()
As we can see in the scatterplot above, the patients who died tended to do so early in the follow-up period, while those who survived were followed for longer.
Yes, it is necessary to scale the data since The data is scaled to make it easier for a model to learn. The benefits of scaling:
a. It speeds up the training process.
b. It prevents the optimization from becoming stuck in a local best-case scenario.
c. It improves the form of the error surface. and comprehend the situation.
We use pd.to_numeric(DATASET["X"], errors='coerce').notnull().all() to check whether the variables are numeric.
1.We do not need to modify any variables
Difference between Parametric and Non-Parametric algorithms: a parametric model assumes that a probability distribution with a given set of parameters can accurately model the population, while a non-parametric model makes no assumptions about the probability distribution when modeling data. For the models you are choosing — are they parametric or non-parametric? Explain. I have two models: Linear Regression and Logistic Regression. Both of my models are parametric because both have a fixed number of parameters and are computationally faster, but they make distributional assumptions.
Label encoding converts a categorical variable into integers, with a number between 0 and num classes-1 depending on the number of classes. One-Hot replaces the category variable with num classes variables, of which one is 1 and the others are 0. When the variable is nominal and there is no order, we utilize one-hot. When there is a natural order to the data, we use label encoding.
# Split the frame into predictors and target, then min-max scale predictors
# into [0, 1].
# NOTE(review): the target here is 'sex', not DEATH_EVENT -- confirm this is
# intentional given the project goal of predicting mortality.
y = dosbol['sex']
X = dosbol.drop(columns=['sex'])
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
# NOTE(review): scaled_df is built but the later train/test split uses the
# unscaled X -- verify which version the models should receive.
scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
scaled_df.head()
| age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | smoking | time | DEATH_EVENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.636364 | 0.0 | 0.071319 | 0.0 | 0.090909 | 1.0 | 0.290823 | 0.157303 | 0.485714 | 0.0 | 0.000000 | 1.0 |
| 1 | 0.272727 | 0.0 | 1.000000 | 0.0 | 0.363636 | 0.0 | 0.288833 | 0.067416 | 0.657143 | 0.0 | 0.007117 | 1.0 |
| 2 | 0.454545 | 0.0 | 0.015693 | 0.0 | 0.090909 | 0.0 | 0.165960 | 0.089888 | 0.457143 | 1.0 | 0.010676 | 1.0 |
| 3 | 0.181818 | 1.0 | 0.011227 | 0.0 | 0.090909 | 0.0 | 0.224148 | 0.157303 | 0.685714 | 0.0 | 0.010676 | 1.0 |
| 4 | 0.454545 | 1.0 | 0.017479 | 1.0 | 0.090909 | 0.0 | 0.365984 | 0.247191 | 0.085714 | 0.0 | 0.014235 | 1.0 |
# Hold out 30% of the rows for testing; random_state pins the shuffle.
# NOTE(review): this splits the UNSCALED X even though scaled_df was built
# earlier -- confirm whether the models were meant to see scaled features.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=2)
from sklearn.linear_model import LogisticRegression
# Baseline logistic-regression classifier for the binary 'sex' target.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train,y_train)
LogisticRegression(max_iter=1000)
predictions = logmodel.predict(X_test)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Per-class precision/recall/F1, then the raw confusion matrix
# (confu_matrix is reused by the heatmap cell further down).
print(classification_report(y_test,predictions))
confu_matrix=confusion_matrix(y_test,predictions)
print(confu_matrix)
precision recall f1-score support
0 0.33 0.09 0.15 32
1 0.64 0.90 0.75 58
accuracy 0.61 90
macro avg 0.49 0.50 0.45 90
weighted avg 0.53 0.61 0.53 90
[[ 3 29]
[ 6 52]]
# Hyper-parameter search for the logistic model over penalty, C, solver and
# iteration budget.
logModel = LogisticRegression(max_iter=5000)
# Bug fix: the solver name was misspelled 'linlinear'; the correct
# scikit-learn solver is 'liblinear', so that candidate silently failed
# every fit in the original grid.  Note that not every penalty/solver
# combination is valid; GridSearchCV records invalid combinations as
# failed fits and moves on.
param_grid = [
    {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
     "C": np.logspace(-4, 4, 20),
     'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
     'max_iter': [100, 1000, 2500, 5000]}
]
from sklearn.model_selection import GridSearchCV
# 3-fold CV over the whole grid, parallelised across all cores.
clf = GridSearchCV(logModel, param_grid=param_grid, cv=3, verbose=True, n_jobs=-1)
best_clf = clf.fit(X, y)
Fitting 3 folds for each of 1600 candidates, totalling 4800 fits
# Best hyper-parameter combination found by the grid search.
best_clf.best_estimator_
LogisticRegression(C=10000.0, solver='newton-cg')
# Accuracy of the tuned model on the FULL dataset (the same data it was
# tuned on), so this is an optimistic estimate.
print(f'Accuracy -: {best_clf.score(X,y):.3f}')
Accuracy -: 0.739
# Visualise the logistic model's confusion matrix as an annotated heatmap.
plt.figure(figsize=(20, 10))
sns.heatmap(confu_matrix, cmap='Blues', annot=True)
plt.show()
from sklearn.linear_model import LinearRegression
# NOTE(review): fitting ordinary linear regression to a binary target gives
# continuous predictions -- presumably intended as a comparison model rather
# than a classifier; confirm.
lm = LinearRegression()
lm.fit(X_train,y_train)
LinearRegression()
# 'predictions' is rebound here; it previously held the logistic outputs.
predictions = lm.predict(X_test)
# Predicted vs actual values.
plt.scatter(y_test,predictions)
plt.show()
# Residual distribution (actual minus predicted).
residuals = y_test - predictions
sns.histplot(residuals)
plt.show()
from sklearn import metrics
# Standard regression error metrics on the held-out split.
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
MAE: 0.38209573797461815 MSE: 0.19502415275502283 RMSE: 0.4416153900794478
from sklearn.metrics import r2_score
# Proportion of variance in the test target explained by the linear model.
print('R2 Score:', r2_score(y_test, predictions))
R2 Score: 0.1488708850669802
# One coefficient per feature; the column label 'sex' names the target.
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['sex'])
coeff_df
| sex | |
|---|---|
| age | 3.626001e-03 |
| anaemia | -1.350906e-02 |
| creatinine_phosphokinase | 4.981428e-05 |
| diabetes | -1.981516e-02 |
| ejection_fraction | -7.275452e-03 |
| high_blood_pressure | -6.465653e-02 |
| platelets | -7.840750e-07 |
| serum_creatinine | -1.594226e-02 |
| serum_sodium | -5.331056e-03 |
| smoking | 4.381830e-01 |
| time | -1.896022e-04 |
| DEATH_EVENT | -7.684708e-02 |
from sklearn.linear_model import LinearRegression
# NOTE(review): 'lr' and 'reg' below refit the same linear model as 'lm'
# above on identical data -- the RMSE and R^2 values simply reproduce the
# earlier results.
lr = LinearRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
from sklearn import metrics
# Test-split RMSE (matches the value printed earlier).
rmse = np.sqrt(metrics.mean_squared_error(y_test,pred))
rmse
0.4416153900794478
from sklearn.linear_model import LinearRegression
reg=LinearRegression().fit(X_train,y_train)
# R^2 on the test split...
reg.score(X_test,y_test)
0.1488708850669802
# ...and on the training split (higher, as expected).
reg.score(X_train,y_train)
0.284100984847228
from sklearn import linear_model
# Lasso with a very large alpha (50); at this strength virtually all
# coefficients are driven to zero, so the model barely fits the data.
lasso_reg=linear_model.Lasso(alpha=50,max_iter=100,tol=0.1)
lasso_reg.fit(X_train,y_train)
Lasso(alpha=50, max_iter=100, tol=0.1)
# NOTE(review): adding 1 to the R^2 score is not a valid metric -- it only
# makes a near-zero (or negative) R^2 look like a high "accuracy".
1+lasso_reg.score(X_test,y_test)
0.9797749919328422
# NOTE(review): 1 - R^2 is likewise not a meaningful accuracy measure.
1-lasso_reg.score(X_train,y_train)
0.9714192847653921
from sklearn.linear_model import Ridge
# Bug fix: this cell is meant to compare Ridge regression, but the original
# constructed a second Lasso via linear_model.Lasso -- so "ridge" results
# were just the Lasso results repeated.  ridge_reg is now a genuine Ridge
# model (same alpha/max_iter/tol for comparability).
ridge_reg = Ridge(alpha=50, max_iter=100, tol=0.1)
ridge_reg.fit(X_train, y_train)
Lasso(alpha=50, max_iter=100, tol=0.1)
# NOTE(review): adding 1 to the R^2 score does not produce a valid metric --
# it only inflates a near-zero (or negative) R^2 into a high-looking number.
1+ridge_reg.score(X_test,y_test)
0.9797749919328422
# NOTE(review): 1 - R^2 is likewise not an accuracy measure.
1-ridge_reg.score(X_train,y_train)
0.9714192847653921